# --------------------------------------------------------
# ' This file combines the preprocessed CNES 1992, ANES 2000, ANES 2008, and TESS 2016 files,
# ' then merges them with state-level close-election indicators across the four elections
# ' and with state-level political advertisement data.
# --------------------------------------------------------

library(rio)
library(data.table)
library(maps)
library(stringr)

here = '~/Dropbox/project-archive/BG-bk/dataverse'

source(file.path(here,'code','rdata_utils.R'))

# --------------------------------------------------------
# read data 
cnes1992 <- import(file.path(here,"data","processed","cnes_1992.dta"))
anes2000 <- import(file.path(here,"data","processed","anes_2000.dta"))
anes2008 <- import(file.path(here,"data","processed","anes_2008.dta"))
tess2016 <- import(file.path(here,"data","processed","tess_2016.dta"))

# read dyadic data sets 
dyad1992 <- import(file.path(here,"data","processed","dyad_1992.dta"))
dyad2000 <- import(file.path(here,"data","processed","dyad_2000.dta"))
dyad2008 <- import(file.path(here,"data","processed","dyad_2008.dta"))

name.data.set <- c("CNES1992","ANES2000","ANES2008","TESS2016")

# generate binary measures for talking frequency 
anes2000$r_talk_polnone = ifelse(anes2000$r_talk_politics == 0, 1, 0)
anes2008$r_Talk_polnone = ifelse(anes2008$r_talk_freq == 0, 1, 0)
tess2016$r_talk_polnone = ifelse(tess2016$r_pol_talk == 4, 1, 0)

#------------------------------------------------------------------------------
# read state election results 
# and then measure close election indicators
#==============================================================================
state.election <- read.csv(file.path(here,'data','rawdata',"state_election_all.csv"),stringsAsFactors=FALSE)

state.full <- state.election[,1]
state.abb <- state.election[,2]

b <- strsplit(names(state.election),"_")
year <- unlist(lapply(b,function(x) x[2]))
year[1] <- NA 

c <- unlist(lapply(b,function(x) x[1]))
d <-strsplit(c,"[.]")

democratic <- unlist(lapply(d,function(x) sum(x %in% "Democratic")))
republican <- unlist(lapply(d,function(x) sum(x %in% "Republican")))
category <- unlist(lapply(d,function(x) x[length(x)]))

list.year <- unique(year) 
list.year <- list.year[2:length(list.year)]

list.data <- list()
list.year <- c("2016","2012","2008","2004","2000","1996","1992","1988")

for (yy  in list.year){

	yy.data <- state.election[,grep(yy,names(state.election))] 
	
	out.data <- array(NA,dim=c(length(state.full),4))
	total <- yy.data[,grep("Total",names(yy.data))]
	rep <- yy.data[,grep("Republican",names(yy.data))]
	dem <- yy.data[,grep("Democratic",names(yy.data))]
	total.d <- rep+dem 
	p.rep <- rep / total.d 
	p.dem <- dem / total.d

	out.data[,1] <- as.numeric(yy)
	out.data[,2] <- rep 
	out.data[,3] <- dem 
	out.data[,4] <- total
	list.data[[match(yy, list.year)]] <- out.data
}

state_close <- as.data.frame(do.call(rbind,list.data))
state_close$state <- rep(state.abb,8)

colnames(state_close) <- c("year","rep","dem","total",'state')

# ---- calculate election closeness
state_close$abs_diff <- abs(state_close$rep-state_close$dem)/abs(state_close$rep+state_close$dem)
state_close$close <- ifelse(state_close$abs_diff<0.05,1,0)

state_close = as.data.table(state_close)

state_fips = data.table(state.fips)
state_fips[, fips := str_pad(fips, 2, pad = "0")]
state_close = merge(x = state_close, y = unique(state_fips[,c("abb","fips")]), by.x = "state", by.y="abb",all.x=TRUE)

state_close[abs_diff < 0.05 & year == 2016,fips] # check fips code for close election states 

# add the number of close elections in three previous elections
setorder(state_close,state,year)

list_year = c(1992,2000,2008,2016)
list_cum_close = list()
mw = 2
for (yy in list_year) {
	moving_state_close = state_close[year >= yy - 4 * mw & year < yy, ]
	moving_state_close[,n_cumulative_close := cumsum(close), by=list(state)]
	moving_state_close = moving_state_close[year == yy - 4 * 1, c('state','n_cumulative_close')]
	moving_state_close[,year := yy]
	list_cum_close[[yy]] = moving_state_close
}

state_cum_close_prior = rbindlist(list_cum_close)

#------------------------------------------------------------------------------
# harmonize variable names across different data sets
#==============================================================================

admin.variables = c("id","year","state","wt","svydate2")
demographic.variables = c("r_age","r_female","r_race","r_educ","r_married","r_working")
political.variables = c("r_ideo", "r_pol_interest","r_vote",'r_talk_freq','r_talk_polnone',
	'r_partyid3a','r_partyid4a',
	'r_partyid3',"r_partyid7", "r_partyid4", 
	"r_bush","r_bush_a" 
	)
network_size.variables = c("n_size")

# --- dyadic level variables 
network_pid.variables = c("a_vote_bush","a_vote_bush_a","a_vote_dk","a_partyid7","a_partyid4","a_partyid_dk","a_partyid3","a_partyid3a","a_partyid4a")
network_tie.variables = c("order","a_relative","a_male","a_spouse","a_live_together",
	"a_same_partyid3","a_same_partyid4","a_same_partyid3a","a_same_partyid4a","a_party_homophily", "a_talkpol",
	"a_vote_same","a_vote_same_a")

all.variables = c(admin.variables, demographic.variables, political.variables, network_size.variables,
	network_pid.variables, network_tie.variables)

list.year <- c(1992,2000,2008,2016)
list.data <- list()
for (yy in list.year){
	if (yy == 1992){
		ind_data <- cnes1992
		dyad_data <- dyad1992 
	} else if (yy == 2000){
		ind_data <- anes2000
		dyad_data <- dyad2000
	} else if (yy == 2008){
		ind_data <- anes2008
		dyad_data <- dyad2008 
	} else if (yy == 2016){
		data <- tess2016 
		data <- data[,names(data)[names(data) %in% all.variables]]
	} 

	if (yy %in% c(2016)==FALSE){
		ind_data <- ind_data[,names(ind_data)[names(ind_data) %in% all.variables]]
		dyad_data <- dyad_data[,names(dyad_data)[names(dyad_data) %in% all.variables]]
		data <- merge(x=ind_data,y=dyad_data,by.x="id",by.y="id",all.x=TRUE,all.y=TRUE)

	}

	list.data[[match(yy,list.year)]] <- data 
}

all_data <- rbindlist(list.data,fill=TRUE)

all_data[, one := seq_len(.N), by=c("year", "id")]

# ---  merge with state-level data
all_data <- merge(x=all_data,y=state_close,by.x=c("state","year"),by.y=c("state","year"),all.x=TRUE)

# --- also merge with lagged state election results 
state_close_lag <- state_close 
names(state_close_lag) <- paste0("lag_",names(state_close_lag))

all_data$lag_year = all_data$year - 4 
all_data <- merge(x=all_data,y=state_close_lag,by.x=c("state","lag_year"),by.y=c("lag_state","lag_year"),all.x=TRUE)

# --- merge with cumulative close election results
all_data = merge(all_data, y=state_cum_close_prior, by=c('state','year'), all.x=TRUE)

#------------------------------------------------------------------------------
# add political advertisement data
#==============================================================================
ad_all = data.table(import(file.path(here,'data','processed','political_ad_daily.dta')))
ad_all = ad_all[state != 'National Cable',]
ad_all = ad_all[state != 'US',]

ad_year = data.table(import(file.path(here,'data','processed','political_ad_year.dta')))
ad_year = ad_year[state != 'National Cable',]
ad_year = ad_year[state != 'US',]

names(ad_year) = paste0(names(ad_year),'_year')
names(ad_year)[1:2] = c('state','year')

all_data_ad = merge(x=all_data, y=ad_all, by.x=c('state','svydate2'),by.y=c('state','date'), all.x=TRUE)
all_data_ad = merge(x=all_data_ad, y=ad_year, by.x=c('state','year'),by.y=c('state','year'), all.x=TRUE)

#------------------------------------------------------------------------------
# additional data cleaning
#==============================================================================

# -- filtering 
all_data_ad = all_data_ad[state != "", ] # 7 cases missing ANES 2000 data

# -- modify some variables 
all_data_ad[year == 1992, a_talkpol := ifelse(a_talkpol >= 1, 1L, 0L)]
all_data_ad[(year == 2000 | year == 2008)&!is.na(a_male), a_talkpol := 1L]
all_data_ad[,r_party_intensity := abs(r_partyid7-4)]

# specify homophily variables : using alternative version?
all_data_ad[, a_homophily_4 := a_vote_same]
all_data_ad[is.na(a_homophily_4), a_homophily_4 := a_same_partyid4]

all_data_ad[, a_homophily_4a := a_vote_same]
all_data_ad[is.na(a_homophily_4a), a_homophily_4a := a_same_partyid4a]

# specify alternative variables [treat independent/something else as one]
all_data_ad[,r_bush_3 := recode(r_bush, "-1=0.5")]
all_data_ad[,a_vote_bush_3 := recode(a_vote_bush, "-1=0.5")]
all_data_ad[,a_partyid3 := recode(a_partyid4, "4=3")]
all_data_ad[,r_partyid3 := recode(r_partyid4, "4=3")]

all_data_ad[, a_homophily_3 := ifelse(r_bush_3 == a_vote_bush_3, 1, 0)]
all_data_ad[is.na(a_homophily_3), a_homophily_3 := ifelse(r_partyid3 == a_partyid3, 1, 0)]

all_data_ad[, a_homophily_3a := ifelse(r_bush_3 == a_vote_bush_3, 1, 0)]
all_data_ad[is.na(a_homophily_3a), a_homophily_3a := ifelse(r_partyid3a == a_partyid3a, 1, 0)]

# specify how much they talk to persons "in-the-middle"
all_data_ad[!is.na(a_vote_bush), a_middle := as.integer(a_vote_bush == -1)]
all_data_ad[!is.na(a_partyid4), a_middle := as.integer(a_partyid4 == 3)]

all_data_ad[!is.na(a_vote_bush), a_middle_3 := as.integer(a_vote_bush_3 == 0.5)]
all_data_ad[!is.na(a_partyid4), a_middle_3 := as.integer(a_partyid3 == 3)]

# specify cross-partisan discussion 
all_data_ad[, a_cross := as.integer(a_middle==0 & a_homophily_4==0)]

# specify the uncertainty of alter's political characteristics
all_data_ad[,a_uncertainty := a_vote_dk]
all_data_ad[is.na(a_uncertainty), a_uncertainty := a_partyid_dk]

# some irrelevant variables
all_data_ad[, a_same_sex := as.integer(a_male != r_female)]

# recode network size 
all_data_ad[,n_size4 := n_size]
all_data_ad[n_size > 4, n_size4 := 4]

all_data_ad[,isolation := ifelse(n_size==0, 1, 0)]

# election related variables 
all_data_ad[,close_election := ifelse(abs_diff < 0.05, 1L, 0L)]
all_data_ad[,close_election_5 := ifelse(abs_diff < 0.05, 1L, 0L)]
all_data_ad[,close_election_7 := ifelse(abs_diff < 0.07, 1L, 0L)]
all_data_ad[,close_election_10 := ifelse(abs_diff < 0.10, 1L, 0L)]

all_data_ad[,close_election_7_10 := ifelse(0.07 <= abs_diff & abs_diff < 0.10 ,1L, 0L)]
all_data_ad[,close_election_5_7 := ifelse(0.05 <= abs_diff & abs_diff < 0.07 ,1L, 0L)]
all_data_ad[,close_election_3_5 := ifelse(0.03 <= abs_diff & abs_diff < 0.05 ,1L, 0L)]
all_data_ad[,close_election_3 := ifelse(abs_diff < 0.03, 1L, 0L)]

# lagged measures 
all_data_ad[,lag_close_election := ifelse(lag_abs_diff < 0.05, 1L, 0L)]
all_data_ad[,lag_close_election_7_10 := ifelse(0.07 <= lag_abs_diff & lag_abs_diff < 0.10 ,1L, 0L)]
all_data_ad[,lag_close_election_5_7 := ifelse(0.05 <= lag_abs_diff & lag_abs_diff < 0.07 ,1L, 0L)]
all_data_ad[,lag_close_election_3_5 := ifelse(0.03 <= lag_abs_diff & lag_abs_diff < 0.05 ,1L, 0L)]
all_data_ad[,lag_close_election_3 := ifelse(lag_abs_diff < 0.03, 1L, 0L)]

all_data_ad[,lag_close_election_5  := ifelse(lag_abs_diff < 0.05, 1L, 0L)]
all_data_ad[,lag_close_election_7  := ifelse(lag_abs_diff < 0.07, 1L, 0L)]
all_data_ad[,lag_close_election_10 := ifelse(lag_abs_diff < 0.10, 1L, 0L)]

# spouse or live together 
all_data_ad[year == 2008,  a_spouse := a_live_together]

export(all_data_ad,file.path(here,'data','processed','CE_processed.dta'))


